mergedata <- read.csv("merged.csv")

colnames(mergedata)
##  [1] "Country"               "Year"                  "Max_Partners"         
##  [4] "GDP_per_unit_CO2"      "PPP_Conv_Rate"         "PPP_Share_GDP"        
##  [7] "Imports_PC"            "Exports_PC"            "Govt_Revenue"         
## [10] "gdp_per_cap"           "agri_perc_gdp"         "agg.empl.agri.perc"   
## [13] "rural.pop.perc"        "pop.tot"               "mobilesub_per100peeps"
## [16] "intl_tourist_arrival"  "total_life_exp"        "life_expectancy_fe"   
## [19] "life_exp_male"         "trade_perGDP"
nrow(mergedata)
## [1] 2673
ncol(mergedata)
## [1] 20
# find missing-value count in each column
# confirming if Nicole has already cleaned the data
summary(mergedata)
##       Country          Year       Max_Partners   GDP_per_unit_CO2 
##  Albania  :  27   Min.   :1990   Min.   : 45.0   Min.   : 0.5535  
##  Algeria  :  27   1st Qu.:1996   1st Qu.:136.0   1st Qu.: 2.9293  
##  Argentina:  27   Median :2003   Median :172.0   Median : 4.3336  
##  Armenia  :  27   Mean   :2003   Mean   :168.3   Mean   : 5.1342  
##  Australia:  27   3rd Qu.:2010   3rd Qu.:205.0   3rd Qu.: 6.2800  
##  Austria  :  27   Max.   :2016   Max.   :235.0   Max.   :39.4786  
##  (Other)  :2511                                                   
##  PPP_Conv_Rate      PPP_Share_GDP       Imports_PC        Exports_PC     
##  Min.   :   0.001   Min.   : 0.0080   Min.   :-66.872   Min.   :-62.400  
##  1st Qu.:   0.672   1st Qu.: 0.0570   1st Qu.:  0.732   1st Qu.:  0.754  
##  Median :   1.693   Median : 0.2270   Median :  6.024   Median :  5.374  
##  Mean   :  86.197   Mean   : 0.9433   Mean   :  6.592   Mean   :  5.791  
##  3rd Qu.:  15.440   3rd Qu.: 0.6810   3rd Qu.: 12.400   3rd Qu.: 10.526  
##  Max.   :4085.960   Max.   :21.7800   Max.   :112.650   Max.   : 93.959  
##                                                                          
##   Govt_Revenue        gdp_per_cap       agri_perc_gdp    agg.empl.agri.perc
##  Min.   :-29.88300   Min.   :   164.3   Min.   : 0.214   Min.   : 0.125    
##  1st Qu.: -2.01900   1st Qu.:  2425.6   1st Qu.: 2.672   1st Qu.: 5.337    
##  Median : -0.16800   Median :  7019.2   Median : 6.467   Median :15.695    
##  Mean   : -0.01627   Mean   : 16052.6   Mean   : 9.791   Mean   :22.405    
##  3rd Qu.:  1.96500   3rd Qu.: 24495.7   3rd Qu.:13.787   3rd Qu.:36.700    
##  Max.   : 36.01400   Max.   :111968.4   Max.   :63.831   Max.   :84.774    
##                                                                            
##  rural.pop.perc      pop.tot          mobilesub_per100peeps
##  Min.   : 2.081   Min.   :2.548e+05   Min.   :  0.000      
##  1st Qu.:21.992   1st Qu.:4.580e+06   1st Qu.:  1.437      
##  Median :34.191   Median :1.042e+07   Median : 34.993      
##  Mean   :36.904   Mean   :5.424e+07   Mean   : 52.295      
##  3rd Qu.:49.272   3rd Qu.:3.854e+07   3rd Qu.:101.254      
##  Max.   :87.379   Max.   :1.379e+09   Max.   :212.639      
##                                                            
##  intl_tourist_arrival total_life_exp  life_expectancy_fe life_exp_male  
##  Min.   :   12000     Min.   :43.41   Min.   :45.61      Min.   :41.38  
##  1st Qu.:  739000     1st Qu.:68.97   1st Qu.:72.45      1st Qu.:65.47  
##  Median : 2152000     Median :73.21   Median :76.37      Median :70.31  
##  Mean   : 7075404     Mean   :71.67   Mean   :74.55      Mean   :68.90  
##  3rd Qu.: 7045000     3rd Qu.:77.17   3rd Qu.:80.19      3rd Qu.:74.50  
##  Max.   :84452000     Max.   :83.98   Max.   :87.14      Max.   :81.70  
##                                                                         
##   trade_perGDP   
##  Min.   : 13.75  
##  1st Qu.: 51.67  
##  Median : 70.85  
##  Mean   : 80.51  
##  3rd Qu.: 98.19  
##  Max.   :408.36  
## 
# good. The data is cleaned. We can proceed to the next stage of the analysis. 

DATA EXPLORATORY ANALYSIS

## DATA EXPLORATORY ANALYSIS: 
head(mergedata)
##   Country Year Max_Partners GDP_per_unit_CO2 PPP_Conv_Rate PPP_Share_GDP
## 1 Albania 1990           75         2.504851         2.117         0.035
## 2 Albania 1991           75         2.684573         2.775         0.024
## 3 Albania 1992           75         4.443426         9.488         0.020
## 4 Albania 1993           75         5.264840        19.912         0.022
## 5 Albania 1994           75         5.542105        26.714         0.023
## 6 Albania 1995           75         6.905429        28.740         0.024
##   Imports_PC Exports_PC Govt_Revenue gdp_per_cap agri_perc_gdp
## 1          0          0       -6.424    1838.673       36.4107
## 2          0          0       -6.424    1331.809       36.4107
## 3          0          0       -6.424    1243.609       36.4107
## 4          0          0       -6.424    1370.830       36.4107
## 5          0          0       -6.424    1493.790       36.4107
## 6          0          0       -6.424    1703.287       36.4107
##   agg.empl.agri.perc rural.pop.perc pop.tot mobilesub_per100peeps
## 1             55.914         63.572 3286542                     0
## 2             55.914         63.300 3266790                     0
## 3             56.134         62.751 3247039                     0
## 4             55.470         62.201 3227287                     0
## 5             54.841         61.646 3207536                     0
## 6             54.257         61.089 3187784                     0
##   intl_tourist_arrival total_life_exp life_expectancy_fe life_exp_male
## 1              1062000         71.836             74.991        69.070
## 2              1062000         71.803             74.980        69.017
## 3              1062000         71.802             74.985        68.997
## 4              1062000         71.860             75.039        69.037
## 5              1062000         71.992             75.158        69.150
## 6              1062000         72.205             75.352        69.347
##   trade_perGDP
## 1     39.43696
## 2     36.07052
## 3    108.78547
## 4     80.51833
## 5     53.10258
## 6     47.61059
summary(mergedata$Max_Partners)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    45.0   136.0   172.0   168.3   205.0   235.0
# find top 10, middle 10, and bottom 10 with respect to the trading partners (Max_Partners var) in 2016
######################################
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ readr   1.3.1
## ✔ tibble  2.1.3     ✔ purrr   0.3.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
#####################################

mergedata_2016 <- subset(mergedata, Year == 2016, select = c(Country,Max_Partners, Year))
mergedata_1990 <- subset(mergedata, Year == 1990, select = c(Country,Max_Partners, Year))

# top n by country for Max_Partners
top10_2016 <- mergedata %>%
  filter(Year == 2016) %>%
  top_n(n = 10, wt = Max_Partners)

top10_1990 <- mergedata %>%
  filter(Year == 1990) %>%
  top_n(n = 10, wt = Max_Partners)

top10_1997 <- mergedata %>%
  filter(Year == 1997) %>%
  top_n(n = 10, wt = Max_Partners)

top10_2005 <- mergedata %>%
  filter(Year == 2005) %>%
  top_n(n = 10, wt = Max_Partners)

top10_2009 <- mergedata %>%
  filter(Year == 2009) %>%
  top_n(n = 10, wt = Max_Partners)

top10_2012 <- mergedata %>%
  filter(Year == 2012) %>%
  top_n(n = 10, wt = Max_Partners)

g90<- ggplot(data = top10_1990, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 1990")


g97<- ggplot(data = top10_1997, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 1997")

g05<- ggplot(data = top10_2005, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 2005")

g09<- ggplot(data = top10_2009, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 2009")

g12 <- ggplot(data = top10_2012, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 2012")

g16<- ggplot(data = top10_2016, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Top 10 Countries in 2016")


#####################
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
#####################

ggarrange(g90,g97,g05,g09,g12,g16)

####################

bottom10_2016 <- mergedata %>%
  filter(Year == 2016) %>%
  top_n(n = -10, wt = Max_Partners) #top_n with neg value can be used to find the bottom values

bottom10_1990 <- mergedata %>%
  filter(Year == 1990) %>%
  top_n(n = -10, wt = Max_Partners)

bottom10_1997 <- mergedata %>%
  filter(Year == 1997) %>%
  top_n(n = -10, wt = Max_Partners)

bottom10_2005 <- mergedata %>%
  filter(Year == 2005) %>%
  top_n(n = -10, wt = Max_Partners)

bottom10_2009 <- mergedata %>%
  filter(Year == 2009) %>%
  top_n(n = -10, wt = Max_Partners)

bottom10_2012 <- mergedata %>%
  filter(Year == 2012) %>%
  top_n(n = -10, wt = Max_Partners)

b90<- ggplot(data = bottom10_1990, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 1990")


b97<- ggplot(data = bottom10_1997, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 1997")

b05<- ggplot(data = bottom10_2005, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 2005")

b09<- ggplot(data = bottom10_2009, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 2009")

b12 <- ggplot(data = bottom10_2012, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 2012")

b16<- ggplot(data = bottom10_2016, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Number of Trading Partners")+ggtitle("Lowest 10 Countries in 2016")

#######
ggarrange(b90,b97,b05,b09,b12,b16)

#######
# Average across the time span:
avg_partners <- mergedata %>% group_by(Country) %>% summarise_at(vars(Max_Partners),funs(mean(.,na.rm=TRUE)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
avg_top10 <- avg_partners %>%
  top_n(n = 10, wt = Max_Partners)

avg_bottom10 <- avg_partners %>% top_n(n = -10, wt = Max_Partners)

avg10_top<- ggplot(data = avg_top10, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Avg Number of Trading Partners")+ggtitle("Top 10 Countries: 1990-2016")

avg10_bottom<- ggplot(data = avg_bottom10, mapping = aes(x = reorder(Country, Max_Partners), Max_Partners)) + 
  geom_bar(stat = "identity", width = 0.3) + coord_flip() +xlab("Country") + ylab("Avg Number of Trading Partners")+ggtitle("Lowest 10 Countries: 1990-2016")

ggarrange(avg10_bottom,avg10_top)

Correlation Matrix:

library(corrplot)
## corrplot 0.84 loaded
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
res <- cor(mergedata[,3:17])
round(res, 2)
##                       Max_Partners GDP_per_unit_CO2 PPP_Conv_Rate PPP_Share_GDP
## Max_Partners                  1.00            -0.16          0.04          0.33
## GDP_per_unit_CO2             -0.16             1.00          0.14         -0.17
## PPP_Conv_Rate                 0.04             0.14          1.00          0.00
## PPP_Share_GDP                 0.33            -0.17          0.00          1.00
## Imports_PC                   -0.02             0.01         -0.01          0.00
## Exports_PC                    0.00             0.01         -0.01          0.01
## Govt_Revenue                 -0.04             0.05          0.02         -0.05
## gdp_per_cap                   0.48            -0.12         -0.13          0.18
## agri_perc_gdp                -0.51             0.41          0.07         -0.16
## agg.empl.agri.perc           -0.45             0.46          0.10         -0.13
## rural.pop.perc               -0.33             0.39          0.04         -0.11
## pop.tot                       0.22            -0.10          0.04          0.58
## mobilesub_per100peeps         0.50            -0.06          0.03          0.01
## intl_tourist_arrival          0.49            -0.15         -0.08          0.61
## total_life_exp                0.46            -0.41         -0.05          0.15
##                       Imports_PC Exports_PC Govt_Revenue gdp_per_cap
## Max_Partners               -0.02       0.00        -0.04        0.48
## GDP_per_unit_CO2            0.01       0.01         0.05       -0.12
## PPP_Conv_Rate              -0.01      -0.01         0.02       -0.13
## PPP_Share_GDP               0.00       0.01        -0.05        0.18
## Imports_PC                  1.00       0.52         0.17       -0.06
## Exports_PC                  0.52       1.00         0.10       -0.05
## Govt_Revenue                0.17       0.10         1.00        0.09
## gdp_per_cap                -0.06      -0.05         0.09        1.00
## agri_perc_gdp               0.02       0.02        -0.10       -0.56
## agg.empl.agri.perc          0.06       0.08        -0.03       -0.62
## rural.pop.perc              0.05       0.08        -0.07       -0.57
## pop.tot                     0.04       0.06        -0.07       -0.10
## mobilesub_per100peeps      -0.12      -0.11        -0.05        0.33
## intl_tourist_arrival       -0.03      -0.02        -0.04        0.28
## total_life_exp             -0.07      -0.05        -0.02        0.59
##                       agri_perc_gdp agg.empl.agri.perc rural.pop.perc pop.tot
## Max_Partners                  -0.51              -0.45          -0.33    0.22
## GDP_per_unit_CO2               0.41               0.46           0.39   -0.10
## PPP_Conv_Rate                  0.07               0.10           0.04    0.04
## PPP_Share_GDP                 -0.16              -0.13          -0.11    0.58
## Imports_PC                     0.02               0.06           0.05    0.04
## Exports_PC                     0.02               0.08           0.08    0.06
## Govt_Revenue                  -0.10              -0.03          -0.07   -0.07
## gdp_per_cap                   -0.56              -0.62          -0.57   -0.10
## agri_perc_gdp                  1.00               0.85           0.73    0.10
## agg.empl.agri.perc             0.85               1.00           0.82    0.20
## rural.pop.perc                 0.73               0.82           1.00    0.21
## pop.tot                        0.10               0.20           0.21    1.00
## mobilesub_per100peeps         -0.43              -0.41          -0.33   -0.06
## intl_tourist_arrival          -0.30              -0.28          -0.21    0.30
## total_life_exp                -0.73              -0.80          -0.69   -0.07
##                       mobilesub_per100peeps intl_tourist_arrival total_life_exp
## Max_Partners                           0.50                 0.49           0.46
## GDP_per_unit_CO2                      -0.06                -0.15          -0.41
## PPP_Conv_Rate                          0.03                -0.08          -0.05
## PPP_Share_GDP                          0.01                 0.61           0.15
## Imports_PC                            -0.12                -0.03          -0.07
## Exports_PC                            -0.11                -0.02          -0.05
## Govt_Revenue                          -0.05                -0.04          -0.02
## gdp_per_cap                            0.33                 0.28           0.59
## agri_perc_gdp                         -0.43                -0.30          -0.73
## agg.empl.agri.perc                    -0.41                -0.28          -0.80
## rural.pop.perc                        -0.33                -0.21          -0.69
## pop.tot                               -0.06                 0.30          -0.07
## mobilesub_per100peeps                  1.00                 0.22           0.46
## intl_tourist_arrival                   0.22                 1.00           0.32
## total_life_exp                         0.46                 0.32           1.00
corrplot(res, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

chart.Correlation(res, histogram=TRUE, pch=19)

###
# In the above plot:
# 
# The distribution of each variable is shown on the diagonal.
# On the bottom of the diagonal : the bivariate scatter plots with a fitted line are displayed
# On the top of the diagonal : the value of the correlation plus the significance level as stars
# Each significance level is associated to a symbol : p-values(0, 0.001, 0.01, 0.05, 0.1, 1) <=> symbols(“***”, “**”, “*”, “.”, " “)
###

Specific Cases: Thailand & South-Korea:

##############################
## Cases of Thailand and South Korea: 

data_thailand <- subset(mergedata, Country == "Thailand")
data_southkorea <- subset(mergedata, Country == "Korea, Republic of")

data_koreathailand <- subset(mergedata, Country == "Thailand" | Country == "Korea, Republic of")
head(data_thailand)
##       Country Year Max_Partners GDP_per_unit_CO2 PPP_Conv_Rate PPP_Share_GDP
## 2377 Thailand 1990          163         4.557155         9.391         0.880
## 2378 Thailand 1991          169         4.445433         9.564         0.932
## 2379 Thailand 1992          174         4.405588         9.728         0.914
## 2380 Thailand 1993          192         4.231155         9.721         0.975
## 2381 Thailand 1994          195         4.093368         9.962         1.022
## 2382 Thailand 1995          201         3.906547        10.318         1.066
##      Imports_PC Exports_PC Govt_Revenue gdp_per_cap agri_perc_gdp
## 2377     23.682     11.686       -0.777    2503.803     12.499628
## 2378     12.950     17.280       -0.777    2686.062     12.649827
## 2379      8.968     13.807       -0.777    2874.132     12.297336
## 2380     11.778     12.735       -0.777    3083.186      8.027850
## 2381     15.749     14.245       -0.777    3299.346      8.629039
## 2382     19.968     15.445       -0.777    3531.749      9.082753
##      agg.empl.agri.perc rural.pop.perc  pop.tot mobilesub_per100peeps
## 2377             60.334         70.576 56558186             0.1117840
## 2378             60.334         70.407 57232465             0.2158757
## 2379             60.905         70.237 57811021             0.4334537
## 2380             56.773         70.066 58337773             0.7089009
## 2381             55.988         69.895 58875269             1.2522796
## 2382             51.975         69.724 59467274             2.1824205
##      intl_tourist_arrival total_life_exp life_expectancy_fe life_exp_male
## 2377              6952000         70.248             73.433        67.175
## 2378              6952000         70.300             73.608        67.117
## 2379              6952000         70.281             73.727        66.976
## 2380              6952000         70.239             73.823        66.815
## 2381              6952000         70.202             73.915        66.668
## 2382              6952000         70.191             74.013        66.565
##      trade_perGDP
## 2377     75.78236
## 2378     78.47113
## 2379     77.95465
## 2380     77.74581
## 2381     81.24895
## 2382     89.75628
old.par <- par(mfrow=c(2, 3))
plot(data_thailand$gdp_per_cap,data_thailand$Max_Partners, col = "red",xlab = "GDP Per Capita (US Dollars)",ylab = "Number of Trading Partners", main = "Case 1: Thailand")
plot(data_thailand$GDP_per_unit_CO2,data_thailand$Max_Partners, col = "red",xlab = "GDP per CO2 unit",ylab = "Number of Trading Partners", main = "Case 1: Thailand")
plot(data_thailand$agri_perc_gdp,data_thailand$Max_Partners, col = "red",xlab = "Agricultural Percentage of GDP",ylab = "Number of Trading Partners", main = "Case 1: Thailand")
plot(data_thailand$total_life_exp,data_thailand$Max_Partners, col = "red",xlab = "Average Life Expectancy",ylab = "Number of Trading Partners", main = "Case 1: Thailand")
plot(data_thailand$intl_tourist_arrival,data_thailand$Max_Partners, col = "red",xlab = "Total Intl Tourist Arrival",ylab = "Number of Trading Partners", main = "Case 1: Thailand")
plot(data_thailand$mobilesub_per100peeps,data_thailand$Max_Partners, col = "red",xlab = "Mobile Subscription per 100 People",ylab = "Number of Trading Partners", main = "Case 1: Thailand")

par(old.par)

old.par2 <- par(mfrow=c(2, 3))
plot(data_southkorea$gdp_per_cap,data_southkorea$Max_Partners, col = "green",xlab = "GDP Per Capita (US Dollars)", ylab = "Number of Trading Partners", main = "Case 2: South Korea")
plot(data_southkorea$GDP_per_unit_CO2,data_southkorea$Max_Partners, col = "green",xlab = "GDP per CO2 unit", ylab = "Number of Trading Partners", main = "Case 2: South Korea")
plot(data_southkorea$agri_perc_gdp,data_southkorea$Max_Partners, col = "green",xlab = "Agricultural Percentage of GDP", ylab = "Number of Trading Partners", main = "Case 2: South Korea")
plot(data_southkorea$total_life_exp,data_southkorea$Max_Partners, col = "green",xlab = "Average Life Expectancy", ylab = "Number of Trading Partners", main = "Case 2: South Korea")
plot(data_southkorea$intl_tourist_arrival,data_southkorea$Max_Partners, col = "green",xlab = "Total Intl Tourist Arrival", ylab = "Number of Trading Partners", main = "Case 2: South Korea")
plot(data_southkorea$mobilesub_per100peeps,data_southkorea$Max_Partners, col = "green",xlab = "Mobile Subscription per 100 People", ylab = "Number of Trading Partners", main = "Case 2: South Korea")

par(old.par2)

Interactive Charts for selected countries:

library(gapminder)
library(ggplot2)
library(gganimate)
library(gifski)

data_interactive <- subset(mergedata, Country == "Thailand" | Country == "Korea, Republic of" | Country == "Malaysia" | Country == "Poland" | Country == "Mexico")


myPlot <- ggplot(data_interactive, aes(mobilesub_per100peeps,Max_Partners, size = total_life_exp, color = Country))+        geom_point() +
  scale_x_log10() +
  theme_bw()  +
  # gganimate specific bits:
  labs(title = 'Year: {frame_time}', x = 'Mobile Subscription per 100 people', y ='Number of Trading Partners') +           transition_time(Year) +
  ease_aes('linear')

# Save at gif: 
animate(myPlot, duration = 5, fps = 20, width = 350, height = 200, renderer = gifski_renderer())
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

anim_save("output.gif")

Principal Component Analysis:

############ Principal Component Analysis: 

## First, we need to use the stationary data over the entire time span to do PCA instead of using time-series data.

avg_all <- mergedata %>% group_by(Country) %>% summarise_at(vars(Max_Partners:trade_perGDP), mean, na.rm=TRUE)
avg_all <- data.frame(avg_all)

avg_all$ID <- seq.int(nrow(avg_all))
country_id <- subset(avg_all, select=c("Country","ID"))

sapply(avg_all, class) # check to see if numeric before PCA.
##               Country          Max_Partners      GDP_per_unit_CO2 
##              "factor"             "numeric"             "numeric" 
##         PPP_Conv_Rate         PPP_Share_GDP            Imports_PC 
##             "numeric"             "numeric"             "numeric" 
##            Exports_PC          Govt_Revenue           gdp_per_cap 
##             "numeric"             "numeric"             "numeric" 
##         agri_perc_gdp    agg.empl.agri.perc        rural.pop.perc 
##             "numeric"             "numeric"             "numeric" 
##               pop.tot mobilesub_per100peeps  intl_tourist_arrival 
##             "numeric"             "numeric"             "numeric" 
##        total_life_exp    life_expectancy_fe         life_exp_male 
##             "numeric"             "numeric"             "numeric" 
##          trade_perGDP                    ID 
##             "numeric"             "integer"
# remove "country"

avg_all_no_countryname <- subset(avg_all, select=c(-Country))

# turning the ID column into row name instead of a column
samp2 <- avg_all_no_countryname[,-19]
rownames(samp2) <- avg_all_no_countryname[,19]

pca_data <- subset(samp2, select = c(-Max_Partners))
pr.out = prcomp(pca_data, scale = TRUE)

names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pr.out$center
##      GDP_per_unit_CO2         PPP_Conv_Rate         PPP_Share_GDP 
##          5.134184e+00          8.619720e+01          9.432858e-01 
##            Imports_PC            Exports_PC          Govt_Revenue 
##          6.591844e+00          5.791484e+00         -1.627273e-02 
##           gdp_per_cap         agri_perc_gdp    agg.empl.agri.perc 
##          1.605263e+04          9.791403e+00          2.240515e+01 
##        rural.pop.perc               pop.tot mobilesub_per100peeps 
##          3.690374e+01          5.423977e+07          5.229527e+01 
##  intl_tourist_arrival        total_life_exp    life_expectancy_fe 
##          7.075404e+06          7.166571e+01          7.454690e+01 
##         life_exp_male          trade_perGDP 
##          6.890341e+01          8.050765e+01
pr.out$scale
##      GDP_per_unit_CO2         PPP_Conv_Rate         PPP_Share_GDP 
##          3.174002e+00          2.838482e+02          2.324133e+00 
##            Imports_PC            Exports_PC          Govt_Revenue 
##          3.080442e+00          3.259744e+00          2.121656e+00 
##           gdp_per_cap         agri_perc_gdp    agg.empl.agri.perc 
##          1.920443e+04          8.917634e+00          1.965072e+01 
##        rural.pop.perc               pop.tot mobilesub_per100peeps 
##          1.861833e+01          1.714795e+08          1.765922e+01 
##  intl_tourist_arrival        total_life_exp    life_expectancy_fe 
##          1.199943e+07          7.625178e+00          7.997949e+00 
##         life_exp_male          trade_perGDP 
##          7.447609e+00          4.049715e+01
pr.out$rotation
##                               PC1         PC2         PC3          PC4
## GDP_per_unit_CO2      -0.22104184  0.13088713 -0.17977715 -0.306270870
## PPP_Conv_Rate         -0.06370749  0.01415015 -0.09116653 -0.474835678
## PPP_Share_GDP          0.07006169 -0.55388903 -0.18870033 -0.046481800
## Imports_PC            -0.09328565 -0.18135920  0.60179388 -0.248026788
## Exports_PC            -0.06559144 -0.21658963  0.62694760 -0.155949832
## Govt_Revenue           0.04641659  0.17303334 -0.05867869 -0.676410820
## gdp_per_cap            0.27649157  0.01862988 -0.07747907 -0.135168722
## agri_perc_gdp         -0.33888797 -0.00221355 -0.01705228  0.131116455
## agg.empl.agri.perc    -0.35342167 -0.04430116 -0.01405742  0.021551453
## rural.pop.perc        -0.31407920 -0.05941447  0.02145071  0.144993974
## pop.tot               -0.05774140 -0.51936282  0.00938868  0.017381679
## mobilesub_per100peeps  0.33293425  0.07253167  0.10428850 -0.062307144
## intl_tourist_arrival   0.13397772 -0.44417899 -0.19130813 -0.029527203
## total_life_exp         0.35218987 -0.03647493  0.03806898  0.033488717
## life_expectancy_fe     0.35156580 -0.03646336  0.06342725  0.061852092
## life_exp_male          0.34581696 -0.03617717  0.01197747  0.002586423
## trade_perGDP           0.11319941  0.28987156  0.32398824  0.261227086
##                                PC5         PC6         PC7          PC8
## GDP_per_unit_CO2      -0.128415266 -0.62186064 -0.20358913  0.076046905
## PPP_Conv_Rate          0.776380135 -0.10200189  0.21816519 -0.271105455
## PPP_Share_GDP         -0.070587417 -0.03041090  0.16954950 -0.186944672
## Imports_PC            -0.070307493  0.07222033 -0.21467075 -0.185012484
## Exports_PC            -0.006238384 -0.21855785 -0.07953351  0.090086386
## Govt_Revenue          -0.346330480  0.31761176  0.27151982  0.310691284
## gdp_per_cap           -0.270127073 -0.48088140  0.20003499 -0.005394304
## agri_perc_gdp          0.061357606 -0.16070319 -0.02009671  0.140027346
## agg.empl.agri.perc     0.012031810 -0.11649623  0.05127892  0.226271478
## rural.pop.perc         0.001110643 -0.20460380  0.15363706  0.270695498
## pop.tot                0.059907503  0.11043591  0.41619067  0.386374641
## mobilesub_per100peeps -0.122720202 -0.07172726  0.01237027 -0.111376141
## intl_tourist_arrival  -0.193281749 -0.18517231 -0.05817219 -0.330131119
## total_life_exp         0.200399102 -0.10657608 -0.12276407  0.312960547
## life_expectancy_fe     0.195842879 -0.07051640 -0.14676484  0.253089174
## life_exp_male          0.197523878 -0.14177652 -0.09094713  0.366310739
## trade_perGDP          -0.020436815 -0.21801788  0.68395212 -0.189541539
##                               PC9         PC10        PC11         PC12
## GDP_per_unit_CO2       0.17017139 -0.489709094  0.04829099 -0.218648030
## PPP_Conv_Rate         -0.04466353  0.112854574  0.09250038  0.053684426
## PPP_Share_GDP          0.11702435  0.060504294 -0.47552721 -0.455530556
## Imports_PC             0.16403288 -0.284266830  0.01633140  0.295167561
## Exports_PC            -0.14850583  0.325479039 -0.10559283 -0.282774283
## Govt_Revenue          -0.28537225 -0.008281377 -0.13904031  0.016854766
## gdp_per_cap            0.34170382  0.432413526 -0.02513892  0.326282514
## agri_perc_gdp         -0.06143913  0.227137034 -0.27854181  0.439391286
## agg.empl.agri.perc    -0.13530008  0.155906559 -0.11344498  0.146148265
## rural.pop.perc        -0.35472249  0.114413780  0.37377244 -0.283674968
## pop.tot                0.35216613 -0.230242058  0.34540565  0.164394460
## mobilesub_per100peeps  0.01916796  0.256630356  0.53252175 -0.156045993
## intl_tourist_arrival  -0.61067900 -0.156170935  0.15069043  0.337509412
## total_life_exp        -0.11155224 -0.097996342 -0.11274389  0.032060543
## life_expectancy_fe    -0.14331176 -0.082011771 -0.06245623 -0.017569002
## life_exp_male         -0.07536078 -0.112042991 -0.15595122  0.080309200
## trade_perGDP          -0.15590070 -0.329943105 -0.18686580 -0.001808859
##                               PC13         PC14          PC15         PC16
## GDP_per_unit_CO2       0.079122659 -0.171149543  0.0286076504  0.046596989
## PPP_Conv_Rate         -0.024169918  0.028721645  0.0003148129  0.010842994
## PPP_Share_GDP         -0.330323975 -0.131710519 -0.0526051193 -0.026619776
## Imports_PC            -0.473778501  0.116091957 -0.1005260366 -0.033278741
## Exports_PC             0.477891216 -0.025764460  0.1443274421  0.013391477
## Govt_Revenue          -0.061253429 -0.057980098  0.0705035245  0.051472000
## gdp_per_cap           -0.064162829  0.361567158 -0.0165068093  0.093368755
## agri_perc_gdp         -0.215394582 -0.517837612  0.4148087576  0.073707243
## agg.empl.agri.perc     0.034310381 -0.098453290 -0.8410645065 -0.099031712
## rural.pop.perc        -0.474590643  0.354077221  0.1669867724 -0.001944363
## pop.tot                0.210614995 -0.111434525  0.0635515970  0.063828136
## mobilesub_per100peeps -0.219199401 -0.612250463 -0.1437537302 -0.136551022
## intl_tourist_arrival   0.169681021  0.009395165  0.0152259276 -0.014726346
## total_life_exp        -0.096098751 -0.007559637 -0.0416536151  0.033420746
## life_expectancy_fe    -0.126253407 -0.056882726 -0.1390599131  0.697872189
## life_exp_male         -0.064352189  0.044090009  0.0720007550 -0.676774029
## trade_perGDP          -0.001857863 -0.092134651 -0.0412569453 -0.001455884
##                                PC17
## GDP_per_unit_CO2      -2.697424e-04
## PPP_Conv_Rate         -2.133766e-04
## PPP_Share_GDP          2.707183e-05
## Imports_PC             7.642497e-05
## Exports_PC             4.191007e-04
## Govt_Revenue           2.866759e-04
## gdp_per_cap           -1.683234e-04
## agri_perc_gdp         -8.356154e-05
## agg.empl.agri.perc    -3.863747e-03
## rural.pop.perc         2.288787e-03
## pop.tot                8.564887e-04
## mobilesub_per100peeps  2.512656e-03
## intl_tourist_arrival   3.758977e-04
## total_life_exp         8.121559e-01
## life_expectancy_fe    -4.257813e-01
## life_exp_male         -3.988545e-01
## trade_perGDP           6.007918e-04
biplot(pr.out, scale = 0)

#### Instead of doing PCA for all variables, we are going to include some selected variables from the exploratory analysis section. 

subset_pcadata <- subset(mergedata,select =c("Country", "Year", "gdp_per_cap","GDP_per_unit_CO2","agri_perc_gdp","total_life_exp","intl_tourist_arrival","mobilesub_per100peeps","agg.empl.agri.perc"))

subset_pcaavg <- subset_pcadata %>% group_by(Country) %>% summarise_at(vars(gdp_per_cap:agg.empl.agri.perc),mean,na.rm=TRUE)

subset_pcaavg <- data.frame(subset_pcaavg)

subset_pcaavg$ID <- seq.int(nrow(subset_pcaavg))
country_id2 <- subset(subset_pcaavg, select=c("Country","ID"))

sapply(subset_pcaavg, class) # check to see if numeric before PCA.
##               Country           gdp_per_cap      GDP_per_unit_CO2 
##              "factor"             "numeric"             "numeric" 
##         agri_perc_gdp        total_life_exp  intl_tourist_arrival 
##             "numeric"             "numeric"             "numeric" 
## mobilesub_per100peeps    agg.empl.agri.perc                    ID 
##             "numeric"             "numeric"             "integer"
# remove "country"

subset_pca_no_countryname <- subset(subset_pcaavg, select=c(-Country))

# turning the ID column into row name instead of a column
samp3 <- subset_pca_no_countryname[,-8]
rownames(samp3) <- subset_pca_no_countryname[,8]

pr.out3 = prcomp(samp3, scale = TRUE)

pr.out3$rotation
##                              PC1         PC2         PC3         PC4
## gdp_per_cap            0.3543596 -0.17954409 -0.55251248  0.66026780
## GDP_per_unit_CO2      -0.2885919 -0.27103080 -0.75656495 -0.49812578
## agri_perc_gdp         -0.4351674 -0.07567857  0.01551139  0.46716951
## total_life_exp         0.4241552  0.05188499  0.01631303 -0.10036049
## intl_tourist_arrival   0.1905211 -0.92531747  0.30548353 -0.06432859
## mobilesub_per100peeps  0.4311152  0.11166338 -0.16775681  0.02121868
## agg.empl.agri.perc    -0.4468860 -0.13117088  0.01924012  0.28811309
##                               PC5         PC6         PC7
## gdp_per_cap           -0.06136437  0.31110275  0.02047082
## GDP_per_unit_CO2       0.11260265 -0.08888081 -0.04645471
## agri_perc_gdp          0.31554845 -0.41268703 -0.56261243
## total_life_exp         0.89020008 -0.04305645  0.11292441
## intl_tourist_arrival  -0.04880529 -0.07915067 -0.03720449
## mobilesub_per100peeps -0.26254540 -0.82964801  0.12582723
## agg.empl.agri.perc     0.14218302 -0.16902717  0.80679711
biplot(pr.out3, scale = 0)

Train and Test Set Split:

# train and test set

set.seed(123)

index <- sample(1:nrow(mergedata),nrow(mergedata)*0.75)
train = mergedata[index,]
test = mergedata[-index,]

nrow(train)
## [1] 2004
nrow(test)
## [1] 669

MODELING: Best Subset Selection

library(leaps)

regfit.full = regsubsets(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + Govt_Revenue + gdp_per_cap +agri_perc_gdp + mobilesub_per100peeps+ intl_tourist_arrival + total_life_exp + agg.empl.agri.perc + rural.pop.perc, data = train)

summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + 
##     Govt_Revenue + gdp_per_cap + agri_perc_gdp + mobilesub_per100peeps + 
##     intl_tourist_arrival + total_life_exp + agg.empl.agri.perc + 
##     rural.pop.perc, data = train)
## 11 Variables  (and intercept)
##                       Forced in Forced out
## Year                      FALSE      FALSE
## GDP_per_unit_CO2          FALSE      FALSE
## PPP_Conv_Rate             FALSE      FALSE
## Govt_Revenue              FALSE      FALSE
## gdp_per_cap               FALSE      FALSE
## agri_perc_gdp             FALSE      FALSE
## mobilesub_per100peeps     FALSE      FALSE
## intl_tourist_arrival      FALSE      FALSE
## total_life_exp            FALSE      FALSE
## agg.empl.agri.perc        FALSE      FALSE
## rural.pop.perc            FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          Year GDP_per_unit_CO2 PPP_Conv_Rate Govt_Revenue gdp_per_cap
## 1  ( 1 ) " "  " "              " "           " "          " "        
## 2  ( 1 ) " "  " "              " "           " "          " "        
## 3  ( 1 ) "*"  " "              " "           " "          "*"        
## 4  ( 1 ) "*"  " "              " "           " "          "*"        
## 5  ( 1 ) "*"  " "              " "           " "          "*"        
## 6  ( 1 ) "*"  " "              "*"           " "          "*"        
## 7  ( 1 ) "*"  " "              "*"           "*"          "*"        
## 8  ( 1 ) "*"  "*"              "*"           "*"          "*"        
##          agri_perc_gdp mobilesub_per100peeps intl_tourist_arrival
## 1  ( 1 ) "*"           " "                   " "                 
## 2  ( 1 ) " "           "*"                   "*"                 
## 3  ( 1 ) " "           " "                   "*"                 
## 4  ( 1 ) "*"           " "                   "*"                 
## 5  ( 1 ) "*"           " "                   "*"                 
## 6  ( 1 ) "*"           " "                   "*"                 
## 7  ( 1 ) "*"           " "                   "*"                 
## 8  ( 1 ) "*"           " "                   "*"                 
##          total_life_exp agg.empl.agri.perc rural.pop.perc
## 1  ( 1 ) " "            " "                " "           
## 2  ( 1 ) " "            " "                " "           
## 3  ( 1 ) " "            " "                " "           
## 4  ( 1 ) " "            " "                " "           
## 5  ( 1 ) " "            " "                "*"           
## 6  ( 1 ) " "            " "                "*"           
## 7  ( 1 ) " "            " "                "*"           
## 8  ( 1 ) " "            " "                "*"
reg.summary = summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
## [1] 0.2661101 0.4023578 0.4815589 0.5127110 0.5238906 0.5321853 0.5355719
## [8] 0.5371904
par(mfrow = c(2,2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
p_max = which.max(reg.summary$adjr2)
points(p_max,reg.summary$adjr2[p_max],col="red",cex=2,pch=20)

plot(reg.summary$cp, xlab = "Number of Variables",ylab = "Cp", type ='l')
cp_min = which.min(reg.summary$cp)
points(cp_min, reg.summary$cp[cp_min], col ="red", cex = 2, pch = 20)

plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = 'l')
bic_min = which.min(reg.summary$bic)
points(bic_min,reg.summary$bic[bic_min], col = "red", cex=2, pch=20)

#coef(regfit.full,bic_min)

Modeling: Forward Stepwise Selection

regfit.fwd = regsubsets(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + Govt_Revenue + gdp_per_cap +agri_perc_gdp + mobilesub_per100peeps+ intl_tourist_arrival + total_life_exp + agg.empl.agri.perc + rural.pop.perc, data = train, method = "forward")

regfwd.summary = summary(regfit.fwd)

par(mfrow = c(2,2))
plot(regfwd.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(regfwd.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
pfwd_max = which.max(reg.summary$adjr2)
points(pfwd_max,regfwd.summary$adjr2[pfwd_max],col="red",cex=2,pch=20)

plot(regfwd.summary$cp, xlab = "Number of Variables",ylab = "Cp", type ='l')
cpfwd_min = which.min(regfwd.summary$cp)
points(cpfwd_min, regfwd.summary$cp[cpfwd_min], col ="red", cex = 2, pch = 20)

plot(regfwd.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = 'l')
bicfwd_min = which.min(regfwd.summary$bic)
points(bicfwd_min,regfwd.summary$bic[bic_min], col = "red", cex=2, pch=20)

# majority vote here: BIC and Cp choose 6 var for best model
coef(regfit.fwd,6)
##          (Intercept)                 Year        PPP_Conv_Rate 
##        -2.967904e+03         1.556104e+00         1.359310e-02 
##          gdp_per_cap        agri_perc_gdp intl_tourist_arrival 
##         6.253637e-04        -1.414467e+00         1.040822e-06 
##       rural.pop.perc 
##         3.848589e-01

Modeling: Backward Stepwise Selection

regfit.bwd = regsubsets(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + Govt_Revenue + gdp_per_cap +agri_perc_gdp + mobilesub_per100peeps+ intl_tourist_arrival + total_life_exp + agg.empl.agri.perc + rural.pop.perc, data = train, method = "backward")

regbwd.summary = summary(regfit.bwd)

par(mfrow = c(2,2))
plot(regbwd.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(regbwd.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
pbwd_max = which.max(reg.summary$adjr2)
points(pbwd_max,regbwd.summary$adjr2[pbwd_max],col="red",cex=2,pch=20)

plot(regbwd.summary$cp, xlab = "Number of Variables",ylab = "Cp", type ='l')
cpbwd_min = which.min(regbwd.summary$cp)
points(cpbwd_min, regbwd.summary$cp[cpbwd_min], col ="red", cex = 2, pch = 20)

plot(regbwd.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = 'l')
bicbwd_min = which.min(regbwd.summary$bic)
points(bicbwd_min,regbwd.summary$bic[bic_min], col = "red", cex=2, pch=20)

# majority vote here: BIC and Cp choose 6 var for best model
coef(regfit.bwd,6)
##          (Intercept)                 Year        PPP_Conv_Rate 
##        -2.967904e+03         1.556104e+00         1.359310e-02 
##          gdp_per_cap        agri_perc_gdp intl_tourist_arrival 
##         6.253637e-04        -1.414467e+00         1.040822e-06 
##       rural.pop.perc 
##         3.848589e-01

Modeling: Ridge Regression

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 3.0-2
x = model.matrix(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + Govt_Revenue + gdp_per_cap+ intl_tourist_arrival + agri_perc_gdp+ rural.pop.perc, data = mergedata)[,-1]
y = mergedata$Max_Partners
grid = 10^ seq(10,-2,length = 100)

#ridge.pred = predict(ridge.mod, s=4, data=test)
#mean((ridge.pred-test$Max_Partners)^2)

set.seed(1)

train_ridge = sample(1:nrow(mergedata), nrow(mergedata)/2)
test_ridge = (-train_ridge)
y.test_ridge = y[test_ridge]

ridge.mod = glmnet(x[train_ridge,], y[train_ridge],alpha = 0, lambda = grid)
ridge.pred = predict(ridge.mod, s=4,newx=x[test_ridge,])
mean((ridge.pred-y.test_ridge)^2)
## [1] 852.5305
# use cross-validation to choose the tuning parameter lambda. 

cv.out = cv.glmnet(x[train_ridge,], y[train_ridge], alpha = 0)
plot(cv.out)

bestlam = cv.out$lambda.min
bestlam
## [1] 2.040658
# find the test MSE associated with this bestlam:

ridge.pred = predict(ridge.mod, s=bestlam, newx=x[test_ridge,])
mse_ridge = mean((ridge.pred-y.test_ridge)^2)
mse_ridge
## [1] 846.0168
out.ridge = glmnet(x,y, alpha = 0)
predict(out.ridge,type = "coefficients",s = bestlam)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -2.944693e+03
## Year                  1.546896e+00
## GDP_per_unit_CO2     -6.202720e-01
## PPP_Conv_Rate         1.125691e-02
## Govt_Revenue         -3.774558e-01
## gdp_per_cap           6.219204e-04
## intl_tourist_arrival  1.010499e-06
## agri_perc_gdp        -1.159886e+00
## rural.pop.perc        2.982004e-01

Modeling: LASSO Regression:

lasso.mod = glmnet(x[train_ridge,], y[train_ridge], alpha = 1, lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

cv.out.lasso = cv.glmnet(x[train_ridge,], y[train_ridge], alpha = 1)
plot(cv.out.lasso)

bestlam_lasso = cv.out.lasso$lambda.min
lasso.pred.lasso = predict(lasso.mod,s=bestlam_lasso,newx=x[test_ridge,])

mse_lasso <- mean((lasso.pred.lasso-y.test_ridge)^2)
mse_lasso
## [1] 841.7351
out = glmnet(x,y,alpha = 1, lambda = grid)
lasso.coef = predict(out, type ="coefficients", s = bestlam_lasso)
lasso.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -3.043913e+03
## Year                  1.595071e+00
## GDP_per_unit_CO2     -6.699200e-01
## PPP_Conv_Rate         1.205399e-02
## Govt_Revenue         -3.870324e-01
## gdp_per_cap           6.621936e-04
## intl_tourist_arrival  1.037427e-06
## agri_perc_gdp        -1.262021e+00
## rural.pop.perc        3.815519e-01
lasso.coef[lasso.coef!=0]
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
## [1] -3.043913e+03  1.595071e+00 -6.699200e-01  1.205399e-02 -3.870324e-01
## [6]  6.621936e-04  1.037427e-06 -1.262021e+00  3.815519e-01

Model: Linear Regression:

ln_fit <- lm(Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + Govt_Revenue + gdp_per_cap+ intl_tourist_arrival + agri_perc_gdp+ rural.pop.perc, data = train)

summary(ln_fit)
## 
## Call:
## lm(formula = Max_Partners ~ Year + GDP_per_unit_CO2 + PPP_Conv_Rate + 
##     Govt_Revenue + gdp_per_cap + intl_tourist_arrival + agri_perc_gdp + 
##     rural.pop.perc, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.103 -18.835   2.385  21.428  69.103 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.983e+03  1.760e+02 -16.949  < 2e-16 ***
## Year                  1.564e+00  8.788e-02  17.800  < 2e-16 ***
## GDP_per_unit_CO2     -5.533e-01  2.095e-01  -2.641 0.008322 ** 
## PPP_Conv_Rate         1.472e-02  2.295e-03   6.415 1.76e-10 ***
## Govt_Revenue         -5.238e-01  1.484e-01  -3.530 0.000426 ***
## gdp_per_cap           6.566e-04  4.342e-05  15.122  < 2e-16 ***
## intl_tourist_arrival  1.015e-06  5.451e-08  18.613  < 2e-16 ***
## agri_perc_gdp        -1.381e+00  1.074e-01 -12.863  < 2e-16 ***
## rural.pop.perc        4.186e-01  5.352e-02   7.822 8.35e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.6 on 1995 degrees of freedom
## Multiple R-squared:  0.5372, Adjusted R-squared:  0.5353 
## F-statistic: 289.5 on 8 and 1995 DF,  p-value: < 2.2e-16
plot(ln_fit)

ano_result <- anova(ln_fit)
#ano_result$`Mean Sq`[8] # residual mean sqrt error for the test set. 

library(sjPlot)
tab_model(ln_fit)
  Max_Partners
Predictors Estimates CI p
(Intercept) -2983.48 -3328.69 – -2638.26 <0.001
Year 1.56 1.39 – 1.74 <0.001
GDP_per_unit_CO2 -0.55 -0.96 – -0.14 0.008
PPP_Conv_Rate 0.01 0.01 – 0.02 <0.001
Govt_Revenue -0.52 -0.81 – -0.23 <0.001
gdp_per_cap 0.00 0.00 – 0.00 <0.001
intl_tourist_arrival 0.00 0.00 – 0.00 <0.001
agri_perc_gdp -1.38 -1.59 – -1.17 <0.001
rural.pop.perc 0.42 0.31 – 0.52 <0.001
Observations 2004
R2 / R2 adjusted 0.537 / 0.535
train_MSE = mean(ln_fit$residuals)^2
train_MSE
## [1] 3.952722e-32
# applying the linear model to the test set: 
test_pred <- predict(ln_fit, test)
test_MSE <- mean((test$Max_Partners-test_pred)^2)
test_MSE
## [1] 889.5553

Modeling: Decision Tree, Bagging, Random Forest, Boosting

Decision Tree

library(rpart)
library(rattle)
## Registered S3 method overwritten by 'rattle':
##   method         from      
##   predict.kmeans parameters
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
mergedata <- read.csv("merged.csv") %>% select(-Country, -life_exp_male, -life_expectancy_fe)
set.seed(123)
splitIndex <- createDataPartition(y=1:nrow(mergedata),p=0.75,list=FALSE)
test  <- mergedata[-splitIndex,]
train <- mergedata[splitIndex,]

tree_simple <- rpart(Max_Partners ~ ., data=train)

pred_test <- predict(tree_simple,newdata = test)
tree_mse <- sum((pred_test - test$Max_Partners)^2/nrow(test), na.rm = TRUE)
print(tree_mse)
## [1] 581.0472
#plot(ctree(Max_Partners ~ ., data=train))
fancyRpartPlot(tree_simple, caption = 'Full tree')

savePlotToFile('simple_tree.png')
## [1] TRUE

Prune tree

pruned_tree <- prune(tree_simple, cp = 0.05)

pred_test <- predict(pruned_tree,newdata = test)
test_mse <- sum((pred_test - test$Max_Partners)^2/nrow(test), na.rm = TRUE)

fancyRpartPlot(pruned_tree, caption = 'Pruned Tree, cp = 0.05')

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(randomForestExplainer)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
library(parallel)
param_list <- list(mtry = c(4,8,16), 
ntree = c(1,seq(2,500,2)))
param_df_forest <- do.call(expand.grid, param_list) %>% mutate(test_mse = 0, train_mse = 0)
forest_run <- function(i){
  mergedata <-  select(read.csv("merged.csv"),-Country, -life_exp_male, -life_expectancy_fe)
  set.seed(123)
  splitIndex <- sample(1:5, nrow(mergedata), replace = TRUE)
  
  temp_test_mse <- 0
  list_return <- list()
  param_list <- list(mtry = c(4,8,16), 
  ntree = c(1,seq(2,500,2)))
  param_df_forest <- do.call(expand.grid, param_list) %>% mutate(test_mse = 0)
  for (iter in 1:5){
    test  <- mergedata[splitIndex != iter,]
    train <- mergedata[splitIndex == iter,]
    
    randomForest_model <- randomForest(Max_Partners ~ ., 
                     data = train, 
                     mtry = param_df_forest[i,'mtry'], 
                     ntree = param_df_forest[i,'ntree'],
                     localImp = TRUE)
    
    pred_test <- predict(randomForest_model,newdata = test)
    temp_test_mse <- temp_test_mse +  sum((pred_test - test$Max_Partners)^2/nrow(test), na.rm = TRUE)
  }
  list_return$test_mse <- temp_test_mse / 5
  
  return(list_return)
}
###### WARNING DON'T RUN THIS CODE UNLESS YOU HAVE A LOT OF CORES
result_list_forest <- mclapply(X = 1:nrow(param_df_forest), FUN = forest_run, mc.cores = 24)
# Gathering results
model_vec <- c()
test_mse_vec <- c()
train_mse_vec <- c()
for (i in 1:length(result_list_forest)){
  model_vec <- c(model_vec, result_list_forest[[i]][['model']])
  test_mse_vec <- c(test_mse_vec, result_list_forest[[i]][['test_mse']])
  train_mse_vec <- c(train_mse_vec, result_list_forest[[i]][['train_mse']])
}
decision_df <- param_df_forest
decision_df['test_mse'] <- test_mse_vec
decision_df['train_mse'] <- train_mse_vec

# making plot of trees to error
plot_testerrorrandom <- ggplot(data = decision_df %>% mutate(mtry = as.factor(mtry)), aes(x = ntree, y = test_mse, color = mtry)) + 
  geom_vline(xintercept = 150, linetype = 'dashed') + 
  geom_line(size = 1.1) +
  ylab("CV Test MSE") + 
  xlab("Number of Trees") + 
  ggtitle("Test Error as a Function of Forest Size") +
  scale_color_manual(values = c("4" = "blue", "8" = "goldenrod", "16" = "forestgreen"), labels = c(expression("m = sqrt(p), 4", "m = p/2, 8","m = p, 16"))) +
  theme_bw() + ylim(230,300)
  theme(legend.title = element_blank())
## List of 1
##  $ legend.title: list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
plot_testerrorrandom_zoom <- ggplot(data = decision_df %>% filter(ntree > 10 & ntree < 150) %>% mutate(mtry = as.factor(mtry)), aes(x = ntree, y = test_mse, color = mtry)) + 
  geom_line(size = 1.1) +
  ylab("CV Test MSE") + 
  xlab("Number of Trees") + 
  ggtitle("Test Error as a Function of Tree #") +
  scale_color_manual(values = c("4" = "blue", "9" = "goldenrod", "18" = "forestgreen"), labels = c(expression("m = sqrt(p), 4", "m = p/2, 9","m = p, 18"))) +
  theme(legend.title = element_blank()) + 
  geom_point(data = data.frame(y = 76.119, x = 120), aes(x = x, y = y), color = "red", size = 5, shape = 2)

plot_testerrorrandom
## Warning: Removed 15 rows containing missing values (geom_path).

ggsave('forest_testerror_plot.png',plot_testerrorrandom, height = 6, width = 8)
## Warning: Removed 15 rows containing missing values (geom_path).
mergedata <-  select(read.csv("merged.csv"),-Country, -life_exp_male, -life_expectancy_fe)
set.seed(123)
splitIndex <- createDataPartition(y=1:nrow(mergedata),p=0.75,list=FALSE)
test  <- mergedata[-splitIndex,]
train <- mergedata[splitIndex,]

best_forest <-     randomForest(Max_Partners ~ ., 
                   data = train, 
                   mtry = 4, 
                   ntree = 150,
                   localImp = TRUE
                 )
best_bag <-     randomForest(Max_Partners ~ ., 
                   data = train, 
                   mtry = 16, 
                   ntree = 150,
                   localImp = TRUE
                 )
min_depth_frame_forest <- min_depth_distribution(best_forest)
min_depth_frame_bag <- min_depth_distribution(best_bag)

plot_depth_forest <- plot_min_depth_distribution(min_depth_frame_forest)
plot_depth_bag <- plot_min_depth_distribution(min_depth_frame_bag)
plot_depth_forest

ggsave('forest_depth_plot_forest.png',plot_depth_forest, height = 6, width = 8)

ggsave('forest_depth_plot_bag.png',plot_depth_bag, height = 6, width = 8)
importance_frame_forest <- measure_importance(best_forest)
importance_frame_bag <- measure_importance(best_bag)
#plot_predict_interaction(best_forest, test, "Intl_tourist_arrival", "mobilesub_per100peeps")

plot_importance_bag <- plot_multi_way_importance(importance_frame_bag, x_measure = "mse_increase", size_measure = "no_of_nodes", main = "Bagging: Multi-way Importance Plot")
plot_importance_forest <- plot_multi_way_importance(importance_frame_forest, x_measure = "mse_increase", size_measure = "no_of_nodes", main = "Forest p = 4: Multi-way Importance Plot")
#plot_importance_forest
ggsave('forest_plot_importance.png',plot_importance_forest, height = 6, width = 8)
ggsave('bag_plot_importance.png',plot_importance_bag, height = 6, width = 8)

figure1 <- ggarrange(plot_depth_bag,plot_depth_forest, 
                    ncol = 2, nrow = 1, labels = c("Bagging","Forest p=4"))
figure1

ggsave('bag_forest_plots1.png',figure1, height = 5, width = 12)

figure2 <- ggarrange(plot_importance_bag, plot_importance_forest,
                    ncol = 2, nrow = 1)
figure2

ggsave('bag_forest_plots2.png',figure2, height = 5, width = 12)

GBM Modeling

library("gbm")
## Loaded gbm 2.1.5
library(RColorBrewer)

param_list <- list(interaction.depth = c(1,2,3,4,5,6,7,8), 
                   shrinkage = c(0.5,0.1,0.01,0.001),
                   ntree = c(seq(10,2000,10))
                 )
param_df_gbm <- do.call(expand.grid, param_list) %>% mutate(test_mse = 0, train_mse = 0)
  
gbm_run <- function(i){
  mergedata <- read.csv("merged.csv") %>% select(-Country, -life_exp_male, -life_expectancy_fe)
  set.seed(123)
  

  param_list <- list(interaction.depth = c(1,2,3,4,5,6,7,8), 
                   shrinkage = c(1,0.1,0.01,0.001),
                   ntree = c(seq(10,2000,10))
                   )
  param_df_gbm <- do.call(expand.grid, param_list) %>% mutate(test_mse = 0, train_mse = 0)
  
  splitIndex <- sample(1:5, nrow(mergedata), replace = TRUE)
  
  list_return <- list()
  temp_test_mse <- 0
  for (iter in 1:5){

    test  <- mergedata[splitIndex != iter,]
    train <- mergedata[splitIndex == iter,]
    
    gbm_model <- gbm(Max_Partners ~ ., 
                   data = train, 
                   distribution = 'gaussian', 
                   n.trees = param_df_gbm[i,'ntree'], 
                   interaction.depth = param_df_gbm[i,'interaction.depth'], 
                   shrinkage = param_df_gbm[i,'shrinkage'])

    pred_test <- predict(gbm_model,newdata = test, n.trees = param_df_gbm[i,'ntree'])
    temp_test_mse <- temp_test_mse + sum((pred_test - test$Max_Partners)^2/nrow(test), na.rm = TRUE)
  }
  list_return$test_mse <- temp_test_mse / 5
  
  return(list_return)
}

###### WARNING DON'T RUN THIS CODE UNLESS YOU HAVE A LOT OF CORES
result_list_gbm <- mclapply(X = 1:nrow(param_df_gbm), FUN = gbm_run, mc.cores = 24)
# Gathering results
model_vec <- c()
test_mse_vec <- c()
train_mse_vec <- c()
for (i in 1:length(result_list_gbm)){
  model_vec <- c(model_vec, result_list_gbm[[i]][['model']])
  test_mse_vec <- c(test_mse_vec, result_list_gbm[[i]][['test_mse']])
  train_mse_vec <- c(train_mse_vec, result_list_gbm[[i]][['train_mse']])
}
decision_df_gbm <- param_df_gbm
decision_df_gbm['test_mse'] <- test_mse_vec
decision_df_gbm['train_mse'] <- train_mse_vec

my_colors = brewer.pal(n = 11, "RdBu")[c(1:4,8:11)] #there are 9, I exluded the two lighter hues

gbm_plot <- ggplot(data = decision_df_gbm %>% mutate(interaction.depth = as.factor(interaction.depth)), aes( x = ntree, y = test_mse, color =  interaction.depth)) + 
  geom_line() + xlab("Number of Trees") + ylab("CV Test MSE") +
  ggtitle("GBM Hyperparameter Analysis") + facet_wrap(~shrinkage, labeller = label_both) + scale_color_manual(values = my_colors)
ggsave('gbm_plot.png',gbm_plot, height = 4, width = 5)

gbm_plot

gbm_plot_zoom <- ggplot(data = decision_df_gbm %>% mutate(interaction.depth = as.factor(interaction.depth)) %>% filter(shrinkage %in% c(0.1,0.01)), aes( x = ntree, y = test_mse, color =  interaction.depth)) + 
  geom_line() + xlab("Number of Trees") + ylab("CV Test MSE") +
  ggtitle("GBM Zoom") + scale_color_manual(values = my_colors) + facet_wrap(~shrinkage, labeller = label_both) + ylim(200,360) + 
  geom_point(data = data.frame(y = 76.21, x = 1900), aes(x = x, y = y), color = "red", size = 5, shape = 2)+ theme(legend.position = "none")
gbm_plot_zoom
## Warning: Removed 515 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

ggsave('gbm_plot_zoom.png',gbm_plot_zoom, height = 4, width = 5)
## Warning: Removed 515 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_point).
  mergedata <- read.csv("merged.csv")
  set.seed(123)
  splitIndex <- createDataPartition(y=1:nrow(mergedata),p=0.75,list=FALSE)
  test  <- mergedata[-splitIndex,]
  train <- mergedata[splitIndex,]


  gbm_model <- gbm(Max_Partners ~ ., 
                   data = train %>% select(-Country, -life_exp_male, -life_expectancy_fe), 
                   distribution = 'gaussian', 
                   n.trees = 2000, 
                   interaction.depth = 8, 
                   shrinkage = 0.01)

summary_gbm_importance <- summary(gbm_model)

gbm_importance <- ggplot(summary_gbm_importance %>% filter(rel.inf > 1), aes(x = reorder(var, rel.inf),y=rel.inf)) + geom_col(fill = 'darkblue') + coord_flip() + ylab("Relative Importance") + xlab("Variable") + ggtitle("Variable Importance in Best GBM Model")
ggsave('gbm_plot_importance.png',gbm_importance, height = 4, width = 5)